HTML剖析及網頁爬蟲實作

林嶔 (Lin, Chin)

Lesson 12

第一節:HTML基本語法介紹(1)

– 所謂的網頁,背後都是用一種叫做「HTML」的語法所撰寫的,而你的IE、Chrome等瀏覽器其實就是負責解析「HTML」語法,並把它呈現成你想看到的樣子。

– 你可以參考全國重度級急救責任醫院急診即時訊息總覽找到更多醫院

F12_1

F12_2

第一節:HTML基本語法介紹(2)

# Address of the NTUH emergency-department real-time information board.
URL <- "https://reg.ntuh.gov.tw/EmgInfoBoard/NTUHEmgInfo.aspx"

# Read the page as a character vector. scan() splits its input on
# whitespace, so every element is one whitespace-delimited piece of the HTML.
txt <- scan(file = URL, what = character(), encoding = "UTF-8", quiet = TRUE)

# Preview the first 15 pieces.
head(txt, n = 15)
##  [1] "<!DOCTYPE"                              
##  [2] "html>"                                  
##  [3] "<html"                                  
##  [4] "xmlns=\"http://www.w3.org/1999/xhtml\">"
##  [5] "<head><title>"                          
##  [6] "國立臺灣大學醫學院附設醫院"             
##  [7] "急診即時資訊"                           
##  [8] "</title>"                               
##  [9] "<style"                                 
## [10] "type=\"text/css\">"                     
## [11] "table"                                  
## [12] ","                                      
## [13] "div"                                    
## [14] "{"                                      
## [15] "font-family:verdana,"
# Glue all pieces back into one long string, separated by single spaces,
# so that patterns spanning several pieces can be searched for.
txt_new <- paste(txt, collapse = " ")

第一節:HTML基本語法介紹(3)

# Search the collapsed page text for the whole <title>...</title> element.
TITLE.pos <- gregexpr("<title>.*</title>", txt_new)

# First match: its start position, and its end position derived from the
# match length.
title_hit <- TITLE.pos[[1]]
start.TITLE.pos <- title_hit[1]
end.TITLE.pos <- start.TITLE.pos + attr(title_hit, "match.length")[1] - 1

# Cut the matched element (tags included) out of the page text.
TITLE.word <- substring(txt_new, start.TITLE.pos, end.TITLE.pos)

TITLE.word
## [1] "<title> 國立臺灣大學醫學院附設醫院 急診即時資訊 </title>"
# Drop the opening tag, then the closing tag, keeping only the title text.
TITLE.word <- gsub("<title>", "", TITLE.word)
TITLE.word <- gsub("</title>", "", TITLE.word)
TITLE.word
## [1] " 國立臺灣大學醫學院附設醫院 急診即時資訊 "

第一節:HTML基本語法介紹(4)

# Positions of every opening and closing table-row tag in the page text.
start.pos <- gregexpr("<tr>", txt_new)
end.pos <- gregexpr("</tr>", txt_new)

# Work on the i-th row: it runs from the start of the i-th "<tr>" to the
# last character of the i-th "</tr>".
i <- 1
row_open <- start.pos[[1]]
row_close <- end.pos[[1]]
sub.start.pos <- row_open[i]
sub.end.pos <- row_close[i] + attr(row_close, "match.length")[i] - 1

sub_txt <- substr(txt_new, sub.start.pos, sub.end.pos)
sub_txt
## [1] "<tr> <td>等候掛號人數:</td> <td>2人</td> </tr>"
# Remove, in this order: the row label, the <tr> tags, the <td> tags and
# finally every remaining space, so only the value is left.
for (pattern in c('等候掛號人數:', '</?tr>', '</?td>', ' ')) {
  sub_txt <- gsub(pattern, '', sub_txt)
}
sub_txt
## [1] "2人"

練習1:寫出一個函數讓我能隨時知道臺大醫院的急診即時訊息

– 這個過程叫做「網路爬蟲」,它的過程與一般手工操作是完全一樣的,因此它是合法的,但是你要注意它會對伺服器產生一定程度的負擔,請不要讓你的程式不斷地擷取資訊。

– 如果你很快就完成了台大醫院的部分,可以再試試看全國重度級急救責任醫院急診即時訊息總覽的其他醫院!

練習1答案

#' Fetch the NTUH emergency-department real-time waiting numbers
#'
#' Downloads the information board page, cuts out the first five table rows
#' and strips the row label and HTML tags so that only the value (for
#' example "2人") remains.
#'
#' @param URL Location of the page to read. Defaults to the NTUH information
#'   board; anything `scan()` can read (including a local file path) works.
#' @return A data.frame with columns `item` (the five fixed labels) and
#'   `info` (the matching value as text, `NA` when the row is not on the page).
NTU_info <- function(URL = "https://reg.ntuh.gov.tw/EmgInfoBoard/NTUHEmgInfo.aspx") {

  result <- data.frame(
    item = c('等候掛號人數', '等候看診人數', '等候住院人數', '等候ICU人數', '等候推床人數'),
    info = NA_character_,
    stringsAsFactors = FALSE
  )

  # scan() splits the page on whitespace; re-join the pieces with single
  # spaces so each table row becomes one contiguous substring.
  pieces <- scan(URL, what = "character", encoding = "UTF-8", quiet = TRUE)
  page <- paste(pieces, collapse = " ")

  row_open <- gregexpr("<tr>", page)[[1]]
  row_close <- gregexpr("</tr>", page)[[1]]

  # gregexpr() reports -1 when nothing matched: no rows, nothing to fill in.
  if (row_open[1] == -1L || row_close[1] == -1L) {
    return(result)
  }

  # Never read past the rows that actually exist on the page.
  n_rows <- min(nrow(result), length(row_open), length(row_close))

  for (i in seq_len(n_rows)) {

    row_end <- row_close[i] + attr(row_close, "match.length")[i] - 1
    row_txt <- substr(page, row_open[i], row_end)

    # Remove only the label, i.e. from "等" up to and including the colon
    # (half- or full-width). A greedy '等.*' would also delete the value.
    row_txt <- gsub("等[^::]*[::]", "", row_txt)
    row_txt <- gsub("</?tr>", "", row_txt)
    row_txt <- gsub("</?td>", "", row_txt)
    result[i, "info"] <- gsub(" ", "", row_txt)

  }

  result

}

NTU_info()
##           item info
## 1 等候掛號人數  2人
## 2 等候看診人數  1人
## 3 等候住院人數 48人
## 4  等候ICU人數  0人
## 5 等候推床人數  0人

第二節:利用套件執行任務(1)

– 套件「rvest」能協助我們做這件事情,套件內的函數「read_html」能協助我們讀取網頁,而函數「html_nodes」能幫助我們把某種標籤的文字萃取出來,最後「html_text」能幫助我們把標籤通通去掉:

library(rvest)

# Same NTUH page as before, this time parsed with rvest.
URL <- "https://reg.ntuh.gov.tw/EmgInfoBoard/NTUHEmgInfo.aspx"

# Parse the page into an HTML document.
website <- read_html(URL)

# Select every <tr> node, then keep only its text (tags removed).
row_nodes <- html_nodes(website, "tr")
needed_txt <- html_text(row_nodes)
needed_txt
## [1] "等候掛號人數:\r\n                        2人\r\n                    "    
## [2] "等候看診人數:\r\n                        1人\r\n                    "    
## [3] "等候住院人數:\r\n                        48人\r\n                    "   
## [4] "等候ICU人數:\r\n                        0人\r\n                    "     
## [5] "等候推床人數:\r\n                        0人\r\n                    "    
## [6] "兒科等候看診人數:\r\n                        0人\r\n                    "
## [7] "兒科等候住院人數:\r\n                        0人\r\n                    "
## [8] "兒科等候ICU人數:\r\n                        0人\r\n                    " 
## [9] "資料擷取時間:2020/5/14 下午 07:35:01\r\n                    "

第二節:利用套件執行任務(2)

F12_4

# One index page of the PTT AllTogether board.
URL <- "https://www.ptt.cc/bbs/AllTogether/index3245.html"
website <- read_html(URL)

# Every <a> (link) node on the page.
needed_html <- html_nodes(website, "a")
needed_html
## {xml_nodeset (58)}
##  [1] <a id="logo" href="/bbs/">批踢踢實業坊</a>
##  [2] <a class="board" href="/bbs/AllTogether/index.html"><span class="bo ...
##  [3] <a class="right small" href="/about.html">關於我們</a>
##  [4] <a class="right small" href="/contact.html">聯絡資訊</a>
##  [5] <a class="btn selected" href="/bbs/AllTogether/index.html">看板</a>
##  [6] <a class="btn" href="/man/AllTogether/index.html">精華區</a>
##  [7] <a class="btn wide" href="/bbs/AllTogether/index1.html">最舊</a>
##  [8] <a class="btn wide" href="/bbs/AllTogether/index3244.html">‹ 上頁</a>
##  [9] <a class="btn wide" href="/bbs/AllTogether/index3246.html">下頁 ›</a>
## [10] <a class="btn wide" href="/bbs/AllTogether/index.html">最新</a>
## [11] <a href="/bbs/AllTogether/M.1589386870.A.75E.html">[徵女] 台南周末享受咖啡點心</a>
## [12] <a href="/bbs/AllTogether/search?q=thread%3A%5B%E5%BE%B5%E5%A5%B3%5 ...
## [13] <a href="/bbs/AllTogether/search?q=author%3Amylucky17">搜尋看板內 myluck ...
## [14] <a href="/bbs/AllTogether/M.1589388230.A.19B.html">[徵女] 熱情開朗女孩</a>
## [15] <a href="/bbs/AllTogether/search?q=thread%3A%5B%E5%BE%B5%E5%A5%B3%5 ...
## [16] <a href="/bbs/AllTogether/search?q=author%3Akpn">搜尋看板內 kpn 的文章</a>
## [17] <a href="/bbs/AllTogether/M.1589388880.A.F92.html">[徵女] 北部醫師徵友</a>
## [18] <a href="/bbs/AllTogether/search?q=thread%3A%5B%E5%BE%B5%E5%A5%B3%5 ...
## [19] <a href="/bbs/AllTogether/search?q=author%3As2710118">搜尋看板內 s271011 ...
## [20] <a href="/bbs/AllTogether/M.1589390194.A.540.html">[徵女] 羅馬競技生死鬥</a>
## ...
# Visible text of each link, in the same order as needed_html.
needed_txt <- html_text(needed_html)
needed_txt
##  [1] "批踢踢實業坊"                           
##  [2] "看板 AllTogether"                       
##  [3] "關於我們"                               
##  [4] "聯絡資訊"                               
##  [5] "看板"                                   
##  [6] "精華區"                                 
##  [7] "最舊"                                   
##  [8] "‹ 上頁"                                 
##  [9] "下頁 ›"                                 
## [10] "最新"                                   
## [11] "[徵女] 台南周末享受咖啡點心"            
## [12] "搜尋同標題文章"                         
## [13] "搜尋看板內 mylucky17 的文章"            
## [14] "[徵女] 熱情開朗女孩"                    
## [15] "搜尋同標題文章"                         
## [16] "搜尋看板內 kpn 的文章"                  
## [17] "[徵女] 北部醫師徵友"                    
## [18] "搜尋同標題文章"                         
## [19] "搜尋看板內 s2710118 的文章"             
## [20] "[徵女] 羅馬競技生死鬥"                  
## [21] "搜尋同標題文章"                         
## [22] "搜尋看板內 arnold3 的文章"              
## [23] "[徵男] 認真!約個凌晨出來走走路"        
## [24] "搜尋同標題文章"                         
## [25] "搜尋看板內 AngelaDepp 的文章"           
## [26] "[徵女] 桃園 尋找以結婚為前提的交往對象" 
## [27] "搜尋同標題文章"                         
## [28] "搜尋看板內 stevenyenyen 的文章"         
## [29] "[徵男]  斷訊中…"                       
## [30] "搜尋同標題文章"                         
## [31] "搜尋看板內 piecebypiece 的文章"         
## [32] "[徵女] 妳也喜歡唱歌嗎?"                
## [33] "搜尋同標題文章"                         
## [34] "搜尋看板內 pchomerex 的文章"            
## [35] "[徵女] 找今天放假的中部女孩~"           
## [36] "搜尋同標題文章"                         
## [37] "搜尋看板內 R0936245xxx 的文章"          
## [38] "[徵女] 找台中同為失戀的女孩一起出來踏青"
## [39] "搜尋同標題文章"                         
## [40] "搜尋看板內 sokalula 的文章"             
## [41] "[徵女] 與北部的妳創造彼此的緣分"        
## [42] "搜尋同標題文章"                         
## [43] "搜尋看板內 ann1236 的文章"              
## [44] "[徵女] 徵一個有緣的妳"                  
## [45] "搜尋同標題文章"                         
## [46] "搜尋看板內 AEGIS1106 的文章"            
## [47] "[徵女] 希望能在下月生日前遇見你"        
## [48] "搜尋同標題文章"                         
## [49] "搜尋看板內 kenny0629 的文章"            
## [50] "[徵女] 許給彼此一個未來"                
## [51] "搜尋同標題文章"                         
## [52] "搜尋看板內 voodist 的文章"              
## [53] "[徵女] 台南明天晚餐"                    
## [54] "搜尋同標題文章"                         
## [55] "搜尋看板內 CIANO 的文章"                
## [56] "[徵女] 中部穩定交往"                    
## [57] "搜尋同標題文章"                         
## [58] "搜尋看板內 seanliao 的文章"
# Indices of the links whose text contains the literal tag "[徵男]"
# (fixed = TRUE, so the brackets are not read as a regex character class).
intrested_pos <- which(grepl("[徵男]", needed_txt, fixed = TRUE))
needed_txt[intrested_pos]
## [1] "[徵男] 認真!約個凌晨出來走走路" "[徵男]  斷訊中…"
# The href attribute (site-relative address) of those same links.
needed_link <- html_attr(needed_html[intrested_pos], "href")

第二節:利用套件執行任務(3)

# Open the i-th matching post: the stored links are site-relative, so
# prepend the host name.
i <- 1
sub_link <- paste0("https://www.ptt.cc", needed_link[i])
sub_website <- read_html(sub_link)

# Nodes carrying the CSS class "article-meta-value" (the post's header values).
article_info <- html_nodes(sub_website, ".article-meta-value")
article_info
## {xml_nodeset (4)}
## [1] <span class="article-meta-value">AngelaDepp (阿ㄐ)</span>
## [2] <span class="article-meta-value">AllTogether</span>
## [3] <span class="article-meta-value">[徵男] 認真!約個凌晨出來走走路</span>
## [4] <span class="article-meta-value">Thu May 14 01:24:34 2020</span>

練習2:請你寫出一個程式找出最近的徵男文

  1. 最新的頁面在https://www.ptt.cc/bbs/AllTogether/index.html,你需要透過下面的方式找出上一頁的連結:
# Newest index page of the board.
URL <- "https://www.ptt.cc/bbs/AllTogether/index.html"
website <- read_html(URL)

# href of the 8th link on the page (the previous-page button).
html_attr(html_nodes(website, "a")[8], "href")
## [1] "/bbs/AllTogether/index3246.html"
  2. 接著,從最新的頁面開始抓取徵男文的標題與連結,直到抓到10篇為止!

  3. 抓滿10篇之後,進去連結內去看看發文者ID以及時間,並把它填入表格之內

##       Title                                          
##  [1,] "[徵男] 認真!約個凌晨出來走走路"              
##  [2,] "[徵男]  斷訊中…"                             
##  [3,] "[徵男] 尋人啟事"                              
##  [4,] "[徵男] 讓我們在音樂中找到彼此"                
##  [5,] "[徵男] 期待遇見能分享生活點滴的人(新竹以北)"
##  [6,] "[徵男] 等一杯咖啡"                            
##  [7,] "[徵男] 明早霧峰桐林花廊步道看油桐花-文有真相" 
##  [8,] "[徵男] 6/26-28徵能一起出去玩的朋友"           
##  [9,] "[徵男] 台北偶爾一起發呆尬聊"                  
## [10,] "[徵男] 台中新竹從朋友當起"                    
##       url                                       
##  [1,] "/bbs/AllTogether/M.1589390676.A.4CB.html"
##  [2,] "/bbs/AllTogether/M.1589399998.A.4D5.html"
##  [3,] "/bbs/AllTogether/M.1589384025.A.7B1.html"
##  [4,] "/bbs/AllTogether/M.1589371655.A.93D.html"
##  [5,] "/bbs/AllTogether/M.1589375745.A.8BE.html"
##  [6,] "/bbs/AllTogether/M.1589376943.A.FCF.html"
##  [7,] "/bbs/AllTogether/M.1589362956.A.1A0.html"
##  [8,] "/bbs/AllTogether/M.1589308051.A.C07.html"
##  [9,] "/bbs/AllTogether/M.1589294019.A.421.html"
## [10,] "/bbs/AllTogether/M.1589296114.A.FD3.html"
##       ID                          time                      
##  [1,] "AngelaDepp (阿ㄐ)"         "Thu May 14 01:24:34 2020"
##  [2,] "piecebypiece (★)"         "Thu May 14 03:59:56 2020"
##  [3,] "happy0412 (阿妮唷!!)"      "Wed May 13 23:33:43 2020"
##  [4,] "shamam203 (妤)"            "Wed May 13 20:07:33 2020"
##  [5,] "butt0412 (小果)"           "Wed May 13 21:15:43 2020"
##  [6,] "joyce802137 (○oo妮妮●)"  "Wed May 13 21:35:41 2020"
##  [7,] "joan2662 (小鳳梨)"         "Wed May 13 17:42:34 2020"
##  [8,] "apple9999 (justneedapple)" "Wed May 13 02:27:29 2020"
##  [9,] "chana (千百個無奈)"        "Tue May 12 22:33:37 2020"
## [10,] "Apple200cc (Ellie)"        "Tue May 12 23:08:32 2020"

練習2答案

# Collect the 10 most recent "[徵男]" posts of the PTT AllTogether board
# (title + link), walking backwards from the newest index page, then open
# every post to record its author ID and posting time.
n_wanted <- 10   # number of posts to collect
max_pages <- 10  # upper bound on index pages to visit
base_url <- "https://www.ptt.cc"

my_table <- matrix("", nrow = n_wanted, ncol = 4)
colnames(my_table) <- c("Title", "url", "ID", "time")

URL <- "https://www.ptt.cc/bbs/AllTogether/index.html"
current_id <- 1  # next empty row of my_table

for (i in seq_len(max_pages)) {

  website <- read_html(URL)
  needed_html <- website %>% html_nodes("a")
  needed_txt <- needed_html %>% html_text()
  intrested_pos <- grep("[徵男]", needed_txt, fixed = TRUE)

  # Copy only as many matches as still fit into the table.
  n_take <- min(length(intrested_pos), n_wanted - current_id + 1)

  if (n_take > 0) {
    keep <- intrested_pos[seq_len(n_take)]
    rows <- seq(current_id, length.out = n_take)
    my_table[rows, "Title"] <- needed_txt[keep]
    my_table[rows, "url"] <- needed_html[keep] %>% html_attr("href")
    current_id <- current_id + n_take
  }

  if (current_id > n_wanted) {
    break
  }

  # Find the previous-page button by its label instead of a fixed position,
  # so a change in the number of header links does not break the walk.
  prev_pos <- grep("上頁", needed_txt, fixed = TRUE)
  if (length(prev_pos) == 0) {
    break
  }
  # Despite its name this is the link to the previous (older) page.
  next_page <- needed_html[prev_pos[1]] %>% html_attr("href")
  if (is.na(next_page)) {
    break  # button without an address: no page left to visit
  }
  URL <- paste0(base_url, next_page)

}

for (i in seq_len(nrow(my_table))) {

  # Rows that were never filled have no post to open.
  if (is.na(my_table[i, "url"]) || !nzchar(my_table[i, "url"])) {
    next
  }

  sub_URL <- paste0(base_url, my_table[i, "url"])
  sub_website <- read_html(sub_URL)
  article_info <- sub_website %>% html_nodes(".article-meta-value") %>% html_text()
  # Header values appear in the order author, board, title, time.
  my_table[i, "ID"] <- article_info[1]
  my_table[i, "time"] <- article_info[4]

}

my_table
##       Title                                          
##  [1,] "[徵男] 認真!約個凌晨出來走走路"              
##  [2,] "[徵男]  斷訊中…"                             
##  [3,] "[徵男] 尋人啟事"                              
##  [4,] "[徵男] 讓我們在音樂中找到彼此"                
##  [5,] "[徵男] 期待遇見能分享生活點滴的人(新竹以北)"
##  [6,] "[徵男] 等一杯咖啡"                            
##  [7,] "[徵男] 明早霧峰桐林花廊步道看油桐花-文有真相" 
##  [8,] "[徵男] 6/26-28徵能一起出去玩的朋友"           
##  [9,] "[徵男] 台北偶爾一起發呆尬聊"                  
## [10,] "[徵男] 台中新竹從朋友當起"                    
##       url                                       
##  [1,] "/bbs/AllTogether/M.1589390676.A.4CB.html"
##  [2,] "/bbs/AllTogether/M.1589399998.A.4D5.html"
##  [3,] "/bbs/AllTogether/M.1589384025.A.7B1.html"
##  [4,] "/bbs/AllTogether/M.1589371655.A.93D.html"
##  [5,] "/bbs/AllTogether/M.1589375745.A.8BE.html"
##  [6,] "/bbs/AllTogether/M.1589376943.A.FCF.html"
##  [7,] "/bbs/AllTogether/M.1589362956.A.1A0.html"
##  [8,] "/bbs/AllTogether/M.1589308051.A.C07.html"
##  [9,] "/bbs/AllTogether/M.1589294019.A.421.html"
## [10,] "/bbs/AllTogether/M.1589296114.A.FD3.html"
##       ID                          time                      
##  [1,] "AngelaDepp (阿ㄐ)"         "Thu May 14 01:24:34 2020"
##  [2,] "piecebypiece (★)"         "Thu May 14 03:59:56 2020"
##  [3,] "happy0412 (阿妮唷!!)"      "Wed May 13 23:33:43 2020"
##  [4,] "shamam203 (妤)"            "Wed May 13 20:07:33 2020"
##  [5,] "butt0412 (小果)"           "Wed May 13 21:15:43 2020"
##  [6,] "joyce802137 (○oo妮妮●)"  "Wed May 13 21:35:41 2020"
##  [7,] "joan2662 (小鳳梨)"         "Wed May 13 17:42:34 2020"
##  [8,] "apple9999 (justneedapple)" "Wed May 13 02:27:29 2020"
##  [9,] "chana (千百個無奈)"        "Tue May 12 22:33:37 2020"
## [10,] "Apple200cc (Ellie)"        "Tue May 12 23:08:32 2020"

第三節:使用cookie(1)

# Newest index page of the Gossiping board, requested without any cookie.
URL <- 'https://www.ptt.cc/bbs/Gossiping/index.html'

website <- read_html(x = URL)
# Show what the server actually sent back.
website
## {xml_document}
## <html>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body>\n\t\t\n<div class="bbs-screen bbs-content">\n    <div class=" ...

F12_5

第三節:使用cookie(2)

– 你可以透過下面這個方式找到電腦目前的cookie共有哪些:

F12_6

F12_7

第三節:使用cookie(3)

library(RCurl)

URL <- 'https://www.ptt.cc/bbs/Gossiping/index.html'
# Create a curl handle and configure it to send the cookie "over18=1" and to
# follow redirects.
# NOTE(review): presumably this cookie records the age confirmation - confirm.
curl <- getCurlHandle()
curlSetOpt(cookie = "over18=1", followlocation = TRUE, curl = curl)
## An object of class "CURLHandle"
## Slot "ref":
## <pointer: 0x65012c0>
# Download the page source as one string through the configured handle.
html_character <- getURL(URL, curl = curl)

# Parse the downloaded text, then list the text of every link on the page.
website <- read_html(html_character)
needed_html <- html_nodes(website, "a")
needed_txt <- html_text(needed_html)
needed_txt
##  [1] "批踢踢實業坊"                                         
##  [2] "看板 Gossiping"                                       
##  [3] "關於我們"                                             
##  [4] "聯絡資訊"                                             
##  [5] "看板"                                                 
##  [6] "精華區"                                               
##  [7] "最舊"                                                 
##  [8] "‹ 上頁"                                               
##  [9] "下頁 ›"                                               
## [10] "最新"                                                 
## [11] "Re: [問卦] 台北街頭老外乞丐變多了"                    
## [12] "搜尋同標題文章"                                       
## [13] "搜尋看板內 ghostl40809 的文章"                        
## [14] "[問卦] 全台灣最年輕最帥的毒理專家是誰?"              
## [15] "搜尋同標題文章"                                       
## [16] "搜尋看板內 wirewool 的文章"                           
## [17] "[問卦] 我不知道如何度過今夜"                          
## [18] "搜尋同標題文章"                                       
## [19] "搜尋看板內 maggie0409 的文章"                         
## [20] "Re: [新聞] 美重要智庫:中國已準備「5年內與美開戰"     
## [21] "搜尋同標題文章"                                       
## [22] "搜尋看板內 genheit 的文章"                            
## [23] "Re: Fw: [新聞] 教甄日常?名校國文290人考錄取從缺 北"  
## [24] "搜尋同標題文章"                                       
## [25] "搜尋看板內 XDDDDDDDDDD 的文章"                        
## [26] "[問卦] 有沒有醫護吃了鳳梨酥會很慘的卦 "               
## [27] "搜尋同標題文章"                                       
## [28] "搜尋看板內 J240245 的文章"                            
## [29] "[問卦] 有小英最近壓力大的八卦嗎"                      
## [30] "搜尋同標題文章"                                       
## [31] "搜尋看板內 apemonkey 的文章"                          
## [32] "[新聞] 中職/千人進場坐不滿 劉志威:打高投低不吸引人"
## [33] "搜尋同標題文章"                                       
## [34] "搜尋看板內 KingChang711 的文章"                       
## [35] "[問卦] 關於youtub廣告,億萬房屋集團?"                
## [36] "搜尋同標題文章"                                       
## [37] "搜尋看板內 sekokawana 的文章"                         
## [38] "Re: [新聞] 彩券行賣刮刮樂生意差 每月最高可獲領1萬"    
## [39] "搜尋同標題文章"                                       
## [40] "搜尋看板內 winnabe 的文章"                            
## [41] "Re: [問卦] 洗發票CP值高不高?"                        
## [42] "搜尋同標題文章"                                       
## [43] "搜尋看板內 ezJapan 的文章"                            
## [44] "[問卦] MIT出來的在台灣只能待中原嗎"                   
## [45] "搜尋同標題文章"                                       
## [46] "搜尋看板內 gino0717 的文章"                           
## [47] "Re: [新聞] 陳建仁放棄卸任禮遇 回中研院薪資最高可"     
## [48] "搜尋同標題文章"                                       
## [49] "搜尋看板內 y800122155 的文章"                         
## [50] "[問卦] 鎢絲燈跟LED燈要選哪種勒?"                     
## [51] "搜尋同標題文章"                                       
## [52] "搜尋看板內 lockinboy 的文章"                          
## [53] "Re: [問卦] 現在民進黨已經無法無天了嗎"                
## [54] "搜尋同標題文章"                                       
## [55] "搜尋看板內 obs11122 的文章"                           
## [56] "[問卦] 白粉已經無法無天了嗎?"                         
## [57] "搜尋同標題文章"                                       
## [58] "搜尋看板內 got1013 的文章"                            
## [59] "[公告] 八卦板板規(2020.04.21)"                        
## [60] "搜尋同標題文章"                                       
## [61] "搜尋看板內 seabox 的文章"                             
## [62] "[協尋] 5/11 20:00五權五街車禍行車記錄器"              
## [63] "搜尋同標題文章"                                       
## [64] "搜尋看板內 HiroXu 的文章"                             
## [65] "[協尋] 5/7 新北市 新海橋 晚上10點行車記錄器"          
## [66] "搜尋同標題文章"                                       
## [67] "搜尋看板內 niarmy 的文章"                             
## [68] "[公告] 五月份置底閒聊文"                              
## [69] "搜尋同標題文章"                                       
## [70] "搜尋看板內 Kay731 的文章"

第三節:使用cookie(4)

library(RCurl)
library(rvest)

# Collect the 10 most recent "[新聞]" posts (replies excluded) of the PTT
# Gossiping board, walking backwards from the newest index page. Pages are
# downloaded through a curl handle that sends the "over18=1" cookie.
n_wanted <- 10   # number of posts to collect
max_pages <- 10  # upper bound on index pages to visit
base_url <- "https://www.ptt.cc"

my_table <- matrix("", nrow = n_wanted, ncol = 2)
colnames(my_table) <- c("Title", "url")

URL <- 'https://www.ptt.cc/bbs/Gossiping/index.html'
curl <- getCurlHandle()
curlSetOpt(cookie = "over18=1", followlocation = TRUE, curl = curl)
## An object of class "CURLHandle"
## Slot "ref":
## <pointer: 0x62e9260>
current_id <- 1  # next empty row of my_table

for (i in seq_len(max_pages)) {

  html_character <- getURL(URL, curl = curl)
  website <- read_html(html_character)

  needed_html <- website %>% html_nodes("a")
  needed_txt <- needed_html %>% html_text()

  # Keep links tagged "[新聞]" whose text does not contain "Re: ".
  is_news <- grepl("[新聞]", needed_txt, fixed = TRUE)
  is_reply <- grepl("Re: ", needed_txt, fixed = TRUE)
  intrested_pos <- which(is_news & !is_reply)

  # Copy only as many matches as still fit into the table.
  n_take <- min(length(intrested_pos), n_wanted - current_id + 1)

  if (n_take > 0) {
    keep <- intrested_pos[seq_len(n_take)]
    rows <- seq(current_id, length.out = n_take)
    my_table[rows, "Title"] <- needed_txt[keep]
    my_table[rows, "url"] <- needed_html[keep] %>% html_attr("href")
    current_id <- current_id + n_take
  }

  if (current_id > n_wanted) {
    break
  }

  # Find the previous-page button by its label instead of a fixed position,
  # so a change in the number of header links does not break the walk.
  prev_pos <- grep("上頁", needed_txt, fixed = TRUE)
  if (length(prev_pos) == 0) {
    break
  }
  # Despite its name this is the link to the previous (older) page.
  next_page <- needed_html[prev_pos[1]] %>% html_attr("href")
  if (is.na(next_page)) {
    break  # button without an address: no page left to visit
  }
  URL <- paste0(base_url, next_page)

}

my_table
##       Title                                                            
##  [1,] "[新聞] 中職/千人進場坐不滿 劉志威:打高投低不吸引人"          
##  [2,] "[新聞] 天花板鐵鎚徹夜狂敲一個月 他被吵到抓狂..扮蜘蛛人爬2樓陽台"
##  [3,] "[新聞]修憲案藍批綠打假球 柯建銘:藍失政權後迷"                  
##  [4,] "[新聞] 川普延長「Google禁令」華為手機恐失去"                    
##  [5,] "[新聞] 民眾黨再傳內鬥!柯貼身人曝蔡璧如對話"                    
##  [6,] "[新聞] 招名威遭家長爆:要學生出錢買書 事後反悔"                  
##  [7,] "[新聞] 騎士尾隨狂吼「為什麼要載我老婆」 夫護"                   
##  [8,] "[新聞] 手淫20秒射「白色液體」正妹裙上 噁男外"                  
##  [9,] "[新聞] 雲林六輕設海水淡化廠 台塑斥資54億元"                     
## [10,] "[新聞] 男移情別戀…他找上門談判「一刀刺中心臟"                  
##       url                                     
##  [1,] "/bbs/Gossiping/M.1589456410.A.971.html"
##  [2,] "/bbs/Gossiping/M.1589455797.A.DE0.html"
##  [3,] "/bbs/Gossiping/M.1589454899.A.AEF.html"
##  [4,] "/bbs/Gossiping/M.1589455551.A.F54.html"
##  [5,] "/bbs/Gossiping/M.1589454588.A.546.html"
##  [6,] "/bbs/Gossiping/M.1589454666.A.5A6.html"
##  [7,] "/bbs/Gossiping/M.1589454784.A.4D2.html"
##  [8,] "/bbs/Gossiping/M.1589454796.A.7EA.html"
##  [9,] "/bbs/Gossiping/M.1589454819.A.925.html"
## [10,] "/bbs/Gossiping/M.1589454202.A.F9A.html"

小結

– 當然,上課所示範的範例僅僅是網頁爬蟲的冰山一角,有許多更特別的case我們沒有辦法一個一個都示範,等你遇到的時候記得上網找教學,並試著從教學中找出該怎麼做!